Data Wrangling data
- Data wrangling is the process of cleaning and unifying complex data sets for analysis, in turn boosting productivity within an organization.
# delete duplicate
# Remove duplicate rows of the dataframe
raw_data %<>% distinct(player,.keep_all= TRUE)
# delete NA's
raw_data %<>% drop_na()
# Summarise
skim(raw_data)
Data summary
| Name |
raw_data |
| Number of rows |
481 |
| Number of columns |
28 |
| _______________________ |
|
| Column type frequency: |
|
| character |
3 |
| numeric |
25 |
| ________________________ |
|
| Group variables |
None |
Variable type: character
| player |
0 |
1 |
7 |
24 |
0 |
481 |
0 |
| nba_country |
0 |
1 |
3 |
16 |
0 |
44 |
0 |
| tm |
0 |
1 |
3 |
3 |
0 |
31 |
0 |
Variable type: numeric
| salary |
0 |
1 |
6682859.45 |
7405536.17 |
46080.00 |
1471382.00 |
3290000.00 |
1.00e+07 |
34682550.00 |
▇▂▁▁▁ |
| nba_draft_number |
0 |
1 |
29.29 |
21.10 |
1.00 |
10.00 |
24.00 |
4.70e+01 |
62.00 |
▇▅▃▃▆ |
| age |
0 |
1 |
26.29 |
4.27 |
19.00 |
23.00 |
26.00 |
2.90e+01 |
41.00 |
▇▇▆▂▁ |
| g |
0 |
1 |
50.52 |
24.67 |
1.00 |
30.00 |
59.00 |
7.10e+01 |
79.00 |
▃▂▂▃▇ |
| mp |
0 |
1 |
1163.17 |
808.61 |
1.00 |
391.00 |
1155.00 |
1.83e+03 |
2898.00 |
▇▅▆▅▂ |
| per |
0 |
1 |
13.36 |
8.74 |
-41.10 |
9.90 |
13.30 |
1.66e+01 |
134.10 |
▁▇▁▁▁ |
| ts |
0 |
1 |
0.54 |
0.11 |
0.00 |
0.51 |
0.54 |
5.80e-01 |
1.50 |
▁▇▂▁▁ |
| x3p_ar |
0 |
1 |
0.34 |
0.23 |
0.00 |
0.17 |
0.35 |
4.80e-01 |
1.00 |
▇▇▇▂▁ |
| f_tr |
0 |
1 |
0.26 |
0.30 |
0.00 |
0.16 |
0.23 |
3.20e-01 |
5.33 |
▇▁▁▁▁ |
| orb |
0 |
1 |
4.91 |
4.58 |
0.00 |
1.80 |
3.30 |
7.10e+00 |
35.90 |
▇▂▁▁▁ |
| drb |
0 |
1 |
15.03 |
6.80 |
0.00 |
10.30 |
14.00 |
1.88e+01 |
37.60 |
▂▇▅▂▁ |
| trb |
0 |
1 |
9.97 |
4.93 |
0.00 |
6.20 |
8.70 |
1.33e+01 |
26.50 |
▂▇▃▂▁ |
| ast |
0 |
1 |
12.96 |
9.09 |
0.00 |
6.90 |
9.90 |
1.72e+01 |
49.40 |
▇▅▂▁▁ |
| stl |
0 |
1 |
1.54 |
0.99 |
0.00 |
1.00 |
1.50 |
1.90e+00 |
12.50 |
▇▁▁▁▁ |
| blk |
0 |
1 |
1.72 |
1.69 |
0.00 |
0.60 |
1.20 |
2.20e+00 |
13.40 |
▇▂▁▁▁ |
| tov |
0 |
1 |
13.12 |
6.12 |
0.00 |
9.90 |
12.50 |
1.56e+01 |
66.70 |
▇▆▁▁▁ |
| usg |
0 |
1 |
18.94 |
5.81 |
5.70 |
15.00 |
17.90 |
2.22e+01 |
45.10 |
▂▇▃▁▁ |
| ows |
0 |
1 |
1.29 |
1.88 |
-2.30 |
0.00 |
0.80 |
2.00e+00 |
11.40 |
▇▇▂▁▁ |
| dws |
0 |
1 |
1.19 |
1.03 |
0.00 |
0.30 |
1.00 |
1.80e+00 |
5.60 |
▇▅▂▁▁ |
| ws |
0 |
1 |
2.48 |
2.67 |
-1.20 |
0.40 |
1.90 |
3.60e+00 |
15.00 |
▇▅▁▁▁ |
| ws_48 |
0 |
1 |
0.08 |
0.16 |
-1.06 |
0.04 |
0.08 |
1.20e-01 |
2.71 |
▁▇▁▁▁ |
| obpm |
0 |
1 |
-1.22 |
5.02 |
-36.50 |
-2.60 |
-1.00 |
4.00e-01 |
68.70 |
▁▇▁▁▁ |
| dbpm |
0 |
1 |
-0.48 |
2.39 |
-14.30 |
-1.60 |
-0.40 |
1.00e+00 |
6.80 |
▁▁▂▇▁ |
| bpm |
0 |
1 |
-1.70 |
5.64 |
-49.20 |
-3.50 |
-1.20 |
6.00e-01 |
54.40 |
▁▁▇▁▁ |
| vorp |
0 |
1 |
0.60 |
1.25 |
-1.30 |
-0.10 |
0.10 |
9.00e-01 |
8.60 |
▇▃▁▁▁ |
raw_data %>%
select_at(vars(-c("player","nba_country","tm"))) %>%
tidyr::gather("id", "value", 2:25) %>%
ggplot(., aes(y=salary, x=value))+
geom_point()+
geom_smooth(method = "lm", se=FALSE, color="black")+
facet_wrap(~id,ncol=2,scales="free_x")
## `geom_smooth()` using formula 'y ~ x'

raw_data %>%
select_at(vars(-c("player","nba_country","tm"))) %>%
tidyr::gather("id", "value", 2:25) %>%
ggplot(., aes(y=log(salary), x=value))+
geom_point()+
geom_smooth(method = "lm", se=FALSE, color="black")+
facet_wrap(~id,ncol=2,scales="free_x")
## `geom_smooth()` using formula 'y ~ x'

Model Selection
nba <- log_data %>% select_at(vars(-vars))
set.seed(1234)
num_data <- nrow(nba)
num_data_test <- 10
train=sample(num_data ,num_data-num_data_test)
data_train <- nba[train,]
data_test <- nba[-train,]
model_select <- regsubsets(salary~. , data =data_train, method = "seqrep",nvmax=24)
model_select_summary <- summary(model_select)
data.frame(
Adj.R2 = (model_select_summary$adjr2),
CP = (model_select_summary$cp),
BIC = (model_select_summary$bic)
)
model_select_summary$outmat
## nba_draft_number age g mp per ts x3p_ar f_tr orb drb trb ast stl
## 1 ( 1 ) " " " " " " "*" " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) "*" "*" "*" " " " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) "*" "*" " " "*" " " " " " " " " " " "*" " " " " " "
## 5 ( 1 ) "*" "*" " " "*" " " " " " " " " " " "*" " " " " " "
## 6 ( 1 ) "*" "*" " " "*" " " " " " " " " " " "*" " " " " " "
## 7 ( 1 ) "*" "*" " " "*" "*" " " " " " " " " " " "*" " " " "
## 8 ( 1 ) "*" "*" " " "*" "*" "*" " " " " " " "*" " " " " " "
## 9 ( 1 ) "*" "*" " " "*" "*" "*" " " " " " " " " "*" " " " "
## 10 ( 1 ) "*" "*" " " "*" "*" "*" "*" " " " " "*" " " " " " "
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" " " " "
## 12 ( 1 ) "*" "*" " " "*" "*" "*" "*" " " " " "*" " " " " " "
## 13 ( 1 ) "*" "*" "*" "*" "*" "*" " " " " " " " " "*" "*" " "
## 14 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" " " " " "*" "*" " "
## 15 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 16 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" " " " " "*" "*" " "
## 17 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" " "
## 18 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" " "
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 20 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*"
## 21 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" "*"
## 22 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 23 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## 24 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
## blk tov usg ows dws ws ws_48 obpm dbpm bpm vorp
## 1 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " " " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " " " " " " " " " "*" " " " "
## 7 ( 1 ) " " " " "*" " " " " " " " " " " " " "*" " "
## 8 ( 1 ) " " " " "*" " " " " " " " " " " " " "*" " "
## 9 ( 1 ) " " " " "*" " " " " " " "*" " " " " "*" " "
## 10 ( 1 ) " " "*" "*" " " " " " " " " " " " " "*" " "
## 11 ( 1 ) " " " " " " " " " " " " " " " " " " " " " "
## 12 ( 1 ) " " "*" "*" " " "*" " " "*" " " " " "*" " "
## 13 ( 1 ) " " "*" "*" " " "*" " " "*" " " "*" " " " "
## 14 ( 1 ) " " "*" "*" " " "*" " " "*" " " "*" " " " "
## 15 ( 1 ) "*" "*" " " " " " " " " " " " " " " " " " "
## 16 ( 1 ) " " "*" "*" " " "*" " " "*" " " "*" " " "*"
## 17 ( 1 ) " " "*" "*" " " "*" " " "*" " " "*" " " "*"
## 18 ( 1 ) " " "*" "*" " " "*" " " "*" "*" " " "*" "*"
## 19 ( 1 ) "*" "*" "*" "*" "*" "*" " " " " " " " " " "
## 20 ( 1 ) "*" "*" "*" "*" "*" "*" "*" " " " " "*" " "
## 21 ( 1 ) "*" "*" "*" " " "*" "*" "*" "*" "*" "*" " "
## 22 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" " " " "
## 23 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" " "
## 24 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" "*"
plot(model_select, scale = "bic", main = "BIC")

data.frame(
Adj.R2 = which.max(model_select_summary$adjr2),
CP = which.min(model_select_summary$cp),
BIC = which.min(model_select_summary$bic)
)
coef(model_select,which.min(model_select_summary$adjr2))
## (Intercept) mp
## 13.729582488 0.001058354
coef(model_select,which.min(model_select_summary$cp))
## (Intercept) nba_draft_number age g
## 10.506881965 -0.022087641 0.099108353 -0.005630508
## mp per ts trb
## 0.001079432 -0.158889718 2.959813795 0.063559521
## ast tov usg dws
## 0.016280400 -0.021765149 0.072773966 -0.241306723
## ws_48 dbpm
## 5.659374488 0.114845569
coef(model_select,which.min(model_select_summary$bic))
## (Intercept) nba_draft_number age mp
## 11.7177685858 -0.0226718307 0.0995664958 0.0007887849
## drb
## 0.0245851796
“All models are wrong, some models are useful”, Box, G.E.P
# adjR2 model
nba_r2 <- lm(salary~ mp , data =data_train)
summary(nba_r2)
##
## Call:
## lm(formula = salary ~ mp, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1998 -0.7103 0.1219 0.7393 3.4727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.373e+01 9.788e-02 140.3 <2e-16 ***
## mp 1.058e-03 6.873e-05 15.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.206 on 469 degrees of freedom
## Multiple R-squared: 0.3358, Adjusted R-squared: 0.3344
## F-statistic: 237.1 on 1 and 469 DF, p-value: < 2.2e-16
# CP model
nba_cp <- lm(salary~ nba_draft_number+age+mp+per+ts+f_tr+trb+ast+tov+usg+dws+ws_48+dbpm, data =data_train)
summary(nba_cp)
##
## Call:
## lm(formula = salary ~ nba_draft_number + age + mp + per + ts +
## f_tr + trb + ast + tov + usg + dws + ws_48 + dbpm, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5542 -0.5648 0.0041 0.6197 3.4279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.3750847 0.5164161 20.091 < 2e-16 ***
## nba_draft_number -0.0220209 0.0024562 -8.966 < 2e-16 ***
## age 0.0984591 0.0110879 8.880 < 2e-16 ***
## mp 0.0009087 0.0001215 7.481 3.81e-13 ***
## per -0.1560700 0.0380094 -4.106 4.77e-05 ***
## ts 2.8888794 0.8035310 3.595 0.000359 ***
## f_tr -0.1972043 0.1667041 -1.183 0.237440
## trb 0.0643382 0.0154437 4.166 3.71e-05 ***
## ast 0.0165634 0.0074575 2.221 0.026837 *
## tov -0.0189888 0.0090615 -2.096 0.036672 *
## usg 0.0751283 0.0196126 3.831 0.000146 ***
## dws -0.2223729 0.1080031 -2.059 0.040065 *
## ws_48 5.5865320 1.6908769 3.304 0.001028 **
## dbpm 0.1059941 0.0337250 3.143 0.001782 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.007 on 457 degrees of freedom
## Multiple R-squared: 0.5486, Adjusted R-squared: 0.5357
## F-statistic: 42.72 on 13 and 457 DF, p-value: < 2.2e-16
# BIC model
nba_bic <- lm(salary~ nba_draft_number+age+mp+drb, data =data_train)
summary(nba_bic)
##
## Call:
## lm(formula = salary ~ nba_draft_number + age + mp + drb, data = data_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5796 -0.5286 0.0462 0.6092 3.0812
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.172e+01 3.354e-01 34.940 < 2e-16 ***
## nba_draft_number -2.267e-02 2.432e-03 -9.323 < 2e-16 ***
## age 9.957e-02 1.110e-02 8.968 < 2e-16 ***
## mp 7.888e-04 6.284e-05 12.551 < 2e-16 ***
## drb 2.459e-02 7.071e-03 3.477 0.000555 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.024 on 466 degrees of freedom
## Multiple R-squared: 0.5238, Adjusted R-squared: 0.5197
## F-statistic: 128.2 on 4 and 466 DF, p-value: < 2.2e-16
# Prediction
# adjR2
predict_r2 <- predict(nba_r2,newdata = data_test)
cbind(predict_r2,data_test$salary)
## predict_r2
## 8 13.85447 10.73813
## 27 15.43036 15.00640
## 32 14.67893 13.61170
## 44 14.80910 16.55332
## 144 14.98902 14.20171
## 183 14.07249 14.22401
## 434 13.83436 14.71210
## 436 13.83330 14.10156
## 457 13.82801 11.43128
## 480 15.67484 15.52106
exp(cbind(predict_r2,data_test$salary))
## predict_r2
## 8 1039726 46080
## 27 5027115 3290000
## 32 2371245 815615
## 44 2700920 15453126
## 144 3233331 1471382
## 183 1293017 1504560
## 434 1019028 2451225
## 436 1017950 1331160
## 457 1012577 92160
## 480 6419408 5504420
mean((data_test$salary-predict_r2)^2)
## [1] 2.132571
sqrt(mean((data_test$salary-predict_r2)^2))
## [1] 1.460332
# CP
predict_cp <- predict(nba_cp,newdata = data_test)
cbind(predict_cp,data_test$salary)
## predict_cp
## 8 12.75007 10.73813
## 27 16.19503 15.00640
## 32 14.00431 13.61170
## 44 15.56626 16.55332
## 144 14.56405 14.20171
## 183 13.76261 14.22401
## 434 13.75832 14.71210
## 436 14.07144 14.10156
## 457 12.97999 11.43128
## 480 15.72969 15.52106
exp(cbind(predict_cp,data_test$salary))
## predict_cp
## 8 344577.7 46080
## 27 10799662.8 3290000
## 32 1207803.7 815615
## 44 5758891.0 15453126
## 144 2113916.5 1471382
## 183 948474.6 1504560
## 434 944415.7 2451225
## 436 1291664.2 1331160
## 457 433648.9 92160
## 480 6781374.8 5504420
mean((data_test$salary-predict_cp)^2)
## [1] 1.028599
sqrt(mean((data_test$salary-predict_cp)^2))
## [1] 1.014199
# BIC
predict_bic <- predict(nba_bic,newdata = data_test)
cbind(predict_bic,data_test$salary)
## predict_bic
## 8 12.81815 10.73813
## 27 16.08326 15.00640
## 32 13.91118 13.61170
## 44 15.60099 16.55332
## 144 14.74093 14.20171
## 183 13.92808 14.22401
## 434 13.91641 14.71210
## 436 14.14277 14.10156
## 457 12.97544 11.43128
## 480 15.80329 15.52106
exp(cbind(predict_bic,data_test$salary))
## predict_bic
## 8 368850.6 46080
## 27 9657631.3 3290000
## 32 1100400.1 815615
## 44 5962422.9 15453126
## 144 2522917.6 1471382
## 183 1119148.8 1504560
## 434 1106169.2 2451225
## 436 1387156.7 1331160
## 457 431680.4 92160
## 480 7299291.2 5504420
mean((data_test$salary-predict_bic)^2)
## [1] 0.9959934
sqrt(mean((data_test$salary-predict_bic)^2))
## [1] 0.9979947